"""Load datasets used by gender and genre section figures and models."""

import hashlib
import os
import random

from sklearn import feature_extraction
from sklearn import feature_selection
import pandas as pd


# Filesystem locations of the bundled data, resolved relative to the
# directory that contains this module.
_MODULE_DIR = os.path.dirname(__file__)
DATA_PATH = os.path.join(_MODULE_DIR, '..', 'data')
NOVELS_DATA_PATH = os.path.join(DATA_PATH, 'novels')
# JSON file read by novels().
NOVELS_PROJECT_FILENAME = os.path.join(
    NOVELS_DATA_PATH,
    'novels-project-works.json',
)


def novels():
    """Load bibliographic records from RFGS.

    Reads ``NOVELS_PROJECT_FILENAME`` (JSON, ``orient='index'``), casts the
    index to ``int``, keeps rows with ``1800 <= year < 1830``, applies a few
    hand-written corrections to records 2899-2902, and fills missing
    authors from the ``translator`` column.

    Returns the resulting DataFrame sorted by its index.

    Raises ``AssertionError`` if, after the corrections, any ``gender``,
    ``publication`` or ``author`` value is null, or if ``gender`` does not
    take exactly three distinct values.

    """
    ######################################################################
    # load data, restrict to years of interest
    ######################################################################
    records = pd.read_json(NOVELS_PROJECT_FILENAME, orient='index')
    records.index = records.index.values.astype(int)
    in_period = (records['year'] >= 1800) & (records['year'] < 1830)
    records = records[in_period]

    # manually add visible gender for these authors in updates
    gender_patches = (
        (2899, 'Male'),
        (2900, 'Female'),
        (2901, 'Unknown'),
        (2902, 'Unknown'),
    )
    for record_id, gender in gender_patches:
        records.loc[record_id, 'gender'] = gender

    # fix publication/publisher swap (same four records as above)
    for record_id, _ in gender_patches:
        records.loc[record_id, 'publication'] = records.loc[record_id, 'publisher']

    # sanity checks
    assert records['gender'].notnull().all()
    assert records['gender'].nunique(dropna=False) == 3
    assert records['publication'].notnull().all()

    # in cases where author is missing, use translator
    author_missing = records['author'].isnull()
    records.loc[author_missing, 'author'] = records.loc[author_missing, 'translator']
    assert records['author'].notnull().all()

    return records.sort_index()


def dataset():
    """Minimally processed data.

    Returns a triple: DataFrame with metadata, document-term matrix, and a tuple
    of vocabulary elements.

    The DataFrame holds the ``year``, ``gender`` and ``title`` columns of
    ``novels()``. The document-term matrix is a dense array of unigram counts
    computed from the titles; its rows follow the DataFrame's row order and
    its columns follow the order of the returned vocabulary tuple.

    """
    df = novels()[['year', 'gender', 'title']]

    # NOTES:
    # - sklearn, by default, ignores single-letter words
    # - note the value associated with min_df
    # - sklearn, by default, lowercases words
    # - stemming does not help significantly
    vec_unigrams = feature_extraction.text.CountVectorizer(
        input='content',
        strip_accents='unicode',
        min_df=1,  # min_df=1 appropriate when using fasttext vectors
    )
    dtm = vec_unigrams.fit_transform(df['title'].values).toarray()
    # min_df=2 is applied in analysis step, need not be applied here

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(vec_unigrams, 'get_feature_names_out'):
        vocab = vec_unigrams.get_feature_names_out()
    else:
        vocab = vec_unigrams.get_feature_names()
    return df, dtm, tuple(vocab)


if __name__ == '__main__':
    # Smoke test only: confirms the data loads and that a chi-squared
    # feature-selection pass runs on it; the results are not inspected.
    print('running superficial checks')
    df, dtm, vocab = dataset()
    for loaded in (df, dtm, vocab):
        assert loaded is not None

    # binary target: 1 where gender is 'Male', 0 otherwise
    is_male = df['gender'].eq('Male').astype(int)
    chi2, pval = feature_selection.chi2(dtm, is_male)
    for statistic in (chi2, pval):
        assert statistic is not None
    print('done running superficial checks')
